---
title: "Process_harmonized"
author: "TMC"
date: '2023-04-25'
output: html_document
---

```{r setup, include=FALSE}
# Set the default chunk option so that chunk source code is echoed.
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
# Extract gene lists by RNA type
#
# Loads the ACC object from acc.rda and splits its gene identifiers into
# protein-coding (mRNA), miRNA, lncRNA and "other" groups according to the
# gene_type annotation. Leaves mRNAgenes, miRNAgenes, lncRNAgenes and
# rRNAgenes in the workspace; every temporary (including ACC) is removed.
#
# NOTE(review): assays() and elementMetadata() are not attached anywhere in
# this file (presumably they come from SummarizedExperiment) -- confirm the
# package is loaded before knitting.

load(file = "acc.rda")

# Gene identifiers are the row names of the TPM assay.
all_genes <- rownames(assays(ACC)$tpm_unstrand)
gene_type <- elementMetadata(ACC)$gene_type

# Index the name vector directly instead of subsetting the assay matrix:
# `m[mask, ]` drops to a plain vector when exactly one row matches, and
# rownames() of that vector is NULL. `%in%` also never yields NA, so a
# missing gene_type cannot inject NA entries into the lists.
mRNAgenes <- all_genes[gene_type %in% "protein_coding"]  # mRNA
miRNAgenes <- all_genes[gene_type %in% "miRNA"]          # miRNA
lncRNAgenes <- all_genes[gene_type %in% "lncRNA"]        # lncRNA

# Everything that is not mRNA, miRNA or lncRNA                # otherRNA
typed_genes <- union(union(mRNAgenes, miRNAgenes), lncRNAgenes)
rRNAgenes <- setdiff(all_genes, typed_genes)

rm(all_genes, gene_type, typed_genes)
rm(ACC)
```

```{r}

# Collect tumor-sample expression from every TCGA project.
#
# For each project the .rda file is loaded, the primary-tumor samples are
# selected and their fpkm_unstrand assay is appended column-wise. Results:
#   RNAsetT     - combined matrix, one column per selected sample
#   vt          - class label per column (0, 1, ... in the order in which
#                 projects contributed samples)
#   classcountT - number of selected samples per contributing project
#
# NOTE(review): assays() and colData() are not attached anywhere in this file
# (presumably they come from SummarizedExperiment) -- confirm before knitting.

# Project codes. Each code is also the name of the object that load()
# restores from the matching entry of `project_files`.
projects <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "DLBC", "ESCA", "GBM",
  "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD", "LUSC",
  "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
  "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)
project_files <- c(
  "acc.rda", "blca.rda", "brca.rda", "cesc.rda", "chol.rda", "coad.rda",
  "dlbc.rda", "esca.rda", "gbm.rda", "hnsc.rda", "kich.rda", "kirc.rda",
  "kirp.rda", "laml.rda", "lgg.rda", "lihc.rda", "luad.rda", "lusc.rda",
  "meso.rda", "ov.rda", "paad.rda", "pcpg.rda", "prad.rda", "read.rda",
  "sarc.rda", "skcm.rda", "stad.rda", "tgct.rda", "thca.rda", "thym.rda",
  "ucec.rda", "ucs.rda", "uvm.rda"
)

# Drop a combined matrix left over from a previous run so that a failure
# below cannot be mistaken for a fresh result.
if (exists("RNAsetT")) rm(RNAsetT)

# Label that the next contributing project will receive.
flagt <- 0

# NOTE(review): classCnt is created here but nothing in this chunk fills it.
classCnt <- as.data.frame(matrix(nrow = 0, ncol = 4))
colnames(classCnt) <- c("Total", "Removed", "Normal", "Tumor")

cnt <- length(projects)

# Per-project matrices are gathered in a list and bound once after the loop
# instead of re-copying the growing matrix with cbind() on every iteration.
tumor_blocks <- vector("list", cnt)
vt <- numeric(0)
classcountT <- integer(0)
gene_ids <- NULL

for (val in seq_len(cnt)) {
  removedN <- totalN <- scntN <- scntT <- integer()

  project_file <- project_files[val]
  project_name <- projects[val]
  load(file = project_file)
  se <- get(project_name)

  totalN <- ncol(se)
  scntN <- 0

  # Prefer solid primary tumors; fall back to the blood-derived sample type
  # for projects that have no "Primary Tumor" samples.
  keep <- which(colData(se)$sample_type == "Primary Tumor")
  if (length(keep) == 0) {
    keep <- which(
      colData(se)$sample_type ==
        "Primary Blood Derived Cancer - Peripheral Blood"
    )
  }
  scntT <- length(keep)

  if (scntT > 0) {
    set <- se[, keep]

    # FPKM-normalized expression is taken from the fpkm_unstrand assay.
    # Other available assays are unstranded, stranded_first, stranded_second,
    # tpm_unstrand and fpkm_uq_unstrand; for differential expression analysis,
    # for example, the unstranded assay has to be used instead.
    setdf <- assays(set)$fpkm_unstrand

    RNAt <- as.matrix(setdf)
    colnames(RNAt) <- colnames(setdf)

    # cbind() does not align rows by name, so refuse to combine projects
    # whose gene rows differ from the first contributing project.
    if (flagt == 0) {
      gene_ids <- rownames(RNAt)
    } else if (!identical(rownames(RNAt), gene_ids)) {
      stop(
        "Gene rows of ", project_name,
        " do not match the previously loaded projects",
        call. = FALSE
      )
    }

    tumor_blocks[[val]] <- RNAt
    vt <- c(vt, rep(flagt, ncol(RNAt)))
    classcountT <- c(classcountT, ncol(RNAt))

    flagt <- flagt + 1
    rm(RNAt)
  }

  # Report the number of selected samples. The count is taken from scntT
  # because RNAt has already been removed (and never exists for a project
  # without matching samples).
  print(c(val, project_name, scntT, " done..."))
}

if (flagt == 0) {
  stop("No tumor samples were found in any project", call. = FALSE)
}

# Bind all contributing projects column-wise, in project order.
has_samples <- !vapply(tumor_blocks, is.null, logical(1))
RNAsetT <- do.call(cbind, tumor_blocks[has_samples])

rm(tumor_blocks, has_samples, gene_ids)

# Filter low-expression features, partition by RNA type and export to CSV.

# One class label per sample, and the combined expression matrix.
labels <- vt
RNAset <- RNAsetT

# Transpose dataset to set features as column headers
RNAset <- t(RNAset)

# Remove features from dataset with mean expression value < 0.05.
# which() discards NA means; drop = FALSE keeps the result a matrix even
# when a single feature survives.
RNAset_filter <- RNAset[, which(colMeans(RNAset) >= 0.05), drop = FALSE]

RNAgenes_filter <- colnames(RNAset_filter)

rRNAgenes_filter <- intersect(rRNAgenes, RNAgenes_filter)
mRNAgenes_filter <- intersect(mRNAgenes, RNAgenes_filter)
miRNAgenes_filter <- intersect(miRNAgenes, RNAgenes_filter)
lncRNAgenes_filter <- intersect(lncRNAgenes, RNAgenes_filter)

# Partition the filtered dataset as per RNA types. drop = FALSE guarantees
# that every partition is written as a samples-by-features table, even if a
# type is left with exactly one feature.
mRNAset_filter <- RNAset_filter[, mRNAgenes_filter, drop = FALSE]
miRNAset_filter <- RNAset_filter[, miRNAgenes_filter, drop = FALSE]
lncRNAset_filter <- RNAset_filter[, lncRNAgenes_filter, drop = FALSE]
othRNAset_filter <- RNAset_filter[, rRNAgenes_filter, drop = FALSE]

# Save labels and the partitioned datasets as csv files. The output folder is
# created when it does not exist yet, so the writes cannot fail on a missing
# directory.
out_dir <- "tcga33tumors"
dir.create(out_dir, showWarnings = FALSE)

write.table(
  labels,
  file.path(out_dir, "labels.csv"),
  sep = ",",
  col.names = FALSE,
  row.names = FALSE
)
write.csv(mRNAset_filter, file.path(out_dir, "data-mRNA.csv"))
write.csv(miRNAset_filter, file.path(out_dir, "data-miRNA.csv"))
write.csv(lncRNAset_filter, file.path(out_dir, "data-lncRNA.csv"))
write.csv(othRNAset_filter, file.path(out_dir, "data-othRNA.csv"))
write.csv(RNAset_filter, file.path(out_dir, "data.csv"))

# Remove variables. The per-project objects are removed by name through
# `projects`, so that vector has to be dropped last.
rm(list = projects)
rm(val, se, set, setdf, keep, flagt, projects, project_files)
rm(out_dir)

```